In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

In [2]:
metadata = pd.read_csv('/Users/anthonymiyoro/Documents/code/MoviePredictor/data/movies_metadata.csv', low_memory=False)
links_small = pd.read_csv('/Users/anthonymiyoro/Documents/code/MoviePredictor/data/links_small.csv')

In [3]:
links_small.head(2)


Out[3]:
movieId imdbId tmdbId
0 1 114709 862.0
1 2 113497 8844.0

In [4]:
# Keep only the rows with a TMDB id, as an integer Series of ids
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

# Drop three rows whose 'id' field is malformed and cannot be cast to int
metadata = metadata.drop([19730, 29503, 35587])

In [5]:
metadata['id'] = metadata['id'].astype('int')

# Collect all the metadata for movies in the links_small dataset
smd = metadata[metadata['id'].isin(links_small)]

In [6]:
# Fill missing taglines with empty strings, then combine overview and tagline into a description column
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [7]:
# Convert the descriptions into a TF-IDF matrix: each unigram and bigram is weighted by how
# distinctive it is across the corpus.
# https://www.quora.com/How-does-TfidfVectorizer-work-in-laymans-terms

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

tfidf_matrix.shape


Out[7]:
(9099, 268124)
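
As a quick illustration of what ngram_range=(1, 2) produces, here is a minimal sketch on a toy two-document corpus (the documents are hypothetical, not drawn from the dataset):

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

toy = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
toy.fit(["space adventure movie", "space pirates"])
# The fitted vocabulary contains both unigrams and bigrams,
# e.g. 'space', 'adventure', 'space adventure', 'space pirates'
sorted(toy.vocabulary_)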

In [8]:
# Cosine Similarity

# I will use cosine similarity to calculate a numeric quantity that denotes the similarity
# between two movies. Mathematically, it is defined as follows:

# cosine(x, y) = (x . y) / (||x|| * ||y||)

# Since the TF-IDF vectorizer L2-normalizes each row, the dot product directly gives the
# cosine similarity score. Therefore, we use sklearn's linear_kernel instead of
# cosine_similarity, since it is much faster.
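
A quick sanity check of that claim, as a minimal sketch (assuming TfidfVectorizer's default norm='l2'; the toy documents are hypothetical):

In [ ]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

X = TfidfVectorizer().fit_transform(["a toy about cars", "a story about toys"])
# On unit-length rows, the dot product equals the cosine similarity
assert np.allclose(linear_kernel(X, X), cosine_similarity(X, X))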

In [9]:
# Calculate the cosine similarity of each movie's description to every other description in the dataset
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

cosine_sim[0]


Out[9]:
array([ 1.        ,  0.00680476,  0.        , ...,  0.        ,
        0.00344913,  0.        ])

In [10]:
# Build a reverse lookup that maps each movie title to its row index in the similarity matrix
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [11]:
# Return the 30 movies most similar to the given title
def get_recommendations(title):
    idx = indices[title]
    # Pair each movie's index with its similarity score to the query movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Skip the first entry: it is the query movie itself (similarity 1.0)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [12]:
get_recommendations('Cars').head(10)


Out[12]:
7814               Cars 2
1829      Bride of Chucky
3564          Bagdad Cafe
4937            Silverado
2391        On Any Sunday
3383               Driven
3473    Cannonball Run II
2069     Cookie's Fortune
5454            The Clock
871        The Great Race
Name: title, dtype: object

In [13]:
## NEW METHOD

# Load the credits and keywords data
credits = pd.read_csv('/Users/anthonymiyoro/Documents/code/MoviePredictor/data/credits.csv')
keywords = pd.read_csv('/Users/anthonymiyoro/Documents/code/MoviePredictor/data/keywords.csv')

# Change all ids to integers
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

In [14]:
# Merge the credits (cast and crew) and the keywords into the metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

In [15]:
metadata.shape


Out[15]:
(46628, 27)

In [16]:
# Rebuild the smaller dataset so that it includes the merged credits and keywords
smd = metadata[metadata['id'].isin(links_small)]
smd.shape


Out[16]:
(9219, 27)

In [17]:
# From the crew, we will pick only the director as a feature.
# From the cast, we will pick only the first 3 members listed, as we assume they are the most influential.

In [18]:
# The cast, crew and keyword columns are stored as stringified lists of dicts, so parse them into Python objects
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))
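
For reference, here is a minimal sketch of what literal_eval does to one of these stringified records (the record is made up, but follows the list-of-dicts format of the credits dump):

In [ ]:
from ast import literal_eval

sample = "[{'cast_id': 14, 'character': 'Woody (voice)', 'name': 'Tom Hanks'}]"
parsed = literal_eval(sample)
parsed[0]['name']  # -> 'Tom Hanks'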

In [19]:
vote_counts = metadata[metadata['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = metadata[metadata['vote_average'].notnull()]['vote_average'].astype('int')
# C is the mean vote average across the whole dataset
C = vote_averages.mean()
C


Out[19]:
5.238696808510638

In [20]:
# m is the minimum vote count required to be listed in the chart: the 95th percentile
m = vote_counts.quantile(0.95)
m


Out[20]:
425.0

In [21]:
# Extract the release year; note that `x != np.nan` is always True (NaN never compares equal),
# so test for missing dates with pd.isnull instead
metadata['year'] = pd.to_datetime(metadata['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if not pd.isnull(x) else np.nan)

In [22]:
# Keep only movies with at least m votes and a non-null vote average
qualified = metadata[(metadata['vote_count'] >= m) & (metadata['vote_count'].notnull()) & (metadata['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape


Out[22]:
(2335, 6)

In [23]:
# IMDB's weighted rating: shrinks the average rating of low-vote-count movies towards the global mean C
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [24]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [ ]:
# List movies with the highest weighted ratings
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [ ]:
# Collect the director's name from the crew list
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [ ]:
# Create a new column that holds the director's name
smd['director'] = smd['crew'].apply(get_director)

In [ ]:
# Keep only the names of the first 3 cast members
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3])

smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [ ]:
# For each movie, we build a metadata dump ("soup") that contains its genres, director, main
# actors and keywords. A count matrix is then built over these soups with a count vectorizer,
# from which we calculate cosine similarities and return the most similar movies.

# For the credit data, we strip spaces and convert to lowercase so that each multi-word name
# becomes a single token. We also mention the director 3 times to weight them above the cast.

smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])
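
Why strip spaces: without this step, two actors who share a first name would share a token in the vectorizer's vocabulary. A minimal sketch (the names are illustrative):

In [ ]:
str.lower("Johnny Depp".replace(" ", ""))  # -> 'johnnydepp', a single unambiguous token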

In [ ]:
# Next, process the keywords: count their frequency across the dataset and keep only the informative ones

In [ ]:
# Flatten the per-movie keyword lists into one long Series
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [ ]:
# Count how often each keyword occurs across all movies
s = s.value_counts()
s[:5]

In [ ]:
# Keep only keywords that occur more than once
s = s[s > 1]

# Reduce keywords to their stems so that variants such as 'dogs' and 'dog' count as the same word
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')  # -> 'dog'

In [ ]:
# Keep only the keywords that appear in the frequency index s (i.e. occur more than once)
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

I use the TMDB ratings to come up with our Top Movies Chart, using IMDB's weighted rating formula. Mathematically, it is represented as follows:

Weighted Rating (WR) = (v / (v + m)) * R + (m / (v + m)) * C

where:

v is the number of votes for the movie,
m is the minimum number of votes required to be listed in the chart,
R is the average rating of the movie, and
C is the mean vote across the whole report.

This is the formula applied above to build the overall Top 250 chart: m was set to the 95th percentile of the vote counts, so for a movie to feature in the chart it must have more votes than at least 95% of the movies in the list.
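
As a quick worked example (hypothetical numbers, with m = 425 and C ≈ 5.24 as computed above):

In [ ]:
# A movie with 1000 votes and an average rating of 8.0
v, R = 1000, 8.0
wr = (v / (v + 425)) * R + (425 / (v + 425)) * 5.24
round(wr, 2)  # -> 7.18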



In [ ]:
# Drop rare keywords, then stem and strip-space/lowercase the survivors
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [ ]:
# Build the word soup: keywords and cast are each included twice to double their weight,
# and the director list already contains the name three times
smd['soup'] = smd['keywords'] + smd['cast'] + smd['keywords'] + smd['cast'] + smd['director']
# Genres could be appended to the soup as well once parsed into lists of names
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [ ]:
# Build a count matrix over the soups
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [ ]:
# Unlike the TF-IDF matrix, the count matrix is not L2-normalized, so use cosine_similarity here
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [ ]:
# Rebuild the title-to-index lookup for the rebuilt smd
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [ ]:
get_recommendations('The Dark Knight').head(6)

In [ ]:
# From these results, we can see that we need to filter out poorly rated movies.
# I will take the top 25 movies by similarity score and compute the vote count of the movie at
# the 60th percentile. Then, using this as the value of m, we calculate the weighted rating of
# each candidate with IMDB's formula, as in the Simple Recommender section.

In [ ]:
# Return a DataFrame of the most similar movies, re-ranked by weighted rating
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Take the 25 most similar movies, skipping the query movie itself
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    # weighted_rating reads the global m and C, so apply the formula inline with the local values
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count'] / (x['vote_count'] + m) * x['vote_average']) + (m / (m + x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(6)
    return qualified

In [ ]:
smd.columns


In [ ]:
improved_recommendations('Casino')